Add an `ostree-boot-complete.service` to propagate staging failures
authorColin Walters <walters@verbum.org>
Fri, 22 Apr 2022 22:46:28 +0000 (18:46 -0400)
committerColin Walters <walters@verbum.org>
Tue, 26 Apr 2022 17:02:46 +0000 (13:02 -0400)
Quite a while ago we added staged deployments, which solved
a bunch of issues around the `/etc` merge.  However...a persistent
problem since then is that any failures in that process that
happened in the *previous* boot are not very visible.

We ship custom code in `rpm-ostree status` to query the previous
journal.  But that has a few problems - one is that on systems
that have been up a while, that failure message may even get
rotated out.  And second, some systems may not even have a persistent
journal at all.

A general thing we do in e.g. Fedora CoreOS testing is to check
for systemd unit failures.  We do that both in our automated tests,
and we even ship code that displays them on ssh logins.  And beyond
that obviously a lot of other projects do the same; it's easy via
`systemctl --failed`.

So to make failures more visible, change our `ostree-finalize-staged.service`
to have an internal wrapper around the process that "catches" any
errors, and copies the error message into a file in `/boot/ostree`.

Then, a new `ostree-boot-complete.service` looks for this file on
startup and re-emits the error message, and fails.

It also deletes the file.  The rationale is to avoid *continually*
warning.  For example we need to handle the case when an upgrade
process creates a new staged deployment.  Now, we could change the
ostree core code to delete the warning file when that happens instead,
but this is trying to be a conservative change.

This should make failures here much more visible as is.

13 files changed:
Makefile-boot.am
Makefile-ostree.am
src/boot/ostree-boot-complete.service [new file with mode: 0644]
src/libostree/ostree-cmdprivate.c
src/libostree/ostree-cmdprivate.h
src/libostree/ostree-impl-system-generator.c
src/libostree/ostree-sysroot-deploy.c
src/libostree/ostree-sysroot-private.h
src/libostree/ostree-sysroot.c
src/ostree/ot-admin-builtin-boot-complete.c [new file with mode: 0644]
src/ostree/ot-admin-builtins.h
src/ostree/ot-builtin-admin.c
tests/kolainst/destructive/staged-deploy.sh

index ec10a0d6496ee1c83a7d3bc805585ed9a2a477ee..e42e5180bfaee647f933156cdd5157059a0831ca 100644 (file)
@@ -38,6 +38,7 @@ endif
 if BUILDOPT_SYSTEMD
 systemdsystemunit_DATA = src/boot/ostree-prepare-root.service \
        src/boot/ostree-remount.service \
+       src/boot/ostree-boot-complete.service \
        src/boot/ostree-finalize-staged.service \
        src/boot/ostree-finalize-staged.path \
        $(NULL)
@@ -64,6 +65,7 @@ endif
 EXTRA_DIST += src/boot/dracut/module-setup.sh \
        src/boot/dracut/ostree.conf \
        src/boot/mkinitcpio \
+       src/boot/ostree-boot-complete.service \
        src/boot/ostree-prepare-root.service \
        src/boot/ostree-finalize-staged.path \
        src/boot/ostree-remount.service \
index 82af16813fe68094abe616bb229c9359ffe026c2..0fe2c5f86c65c4516e808d9c6f19b8729c8a1cbc 100644 (file)
@@ -70,6 +70,7 @@ ostree_SOURCES += \
        src/ostree/ot-admin-builtin-diff.c \
        src/ostree/ot-admin-builtin-deploy.c \
        src/ostree/ot-admin-builtin-finalize-staged.c \
+       src/ostree/ot-admin-builtin-boot-complete.c \
        src/ostree/ot-admin-builtin-undeploy.c \
        src/ostree/ot-admin-builtin-instutil.c \
        src/ostree/ot-admin-builtin-cleanup.c \
diff --git a/src/boot/ostree-boot-complete.service b/src/boot/ostree-boot-complete.service
new file mode 100644 (file)
index 0000000..5c09fdc
--- /dev/null
@@ -0,0 +1,33 @@
+# Copyright (C) 2022 Red Hat, Inc.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library. If not, see <https://www.gnu.org/licenses/>.
+
+[Unit]
+Description=OSTree Complete Boot
+Documentation=man:ostree(1)
+# For now, this is the only condition on which we start, but it's
+# marked as a triggering condition in case in the future we want
+# to do something else.
+ConditionPathExists=|/boot/ostree/finalize-failure.stamp
+RequiresMountsFor=/boot
+# Ensure that we propagate the failure into the current boot before
+# any further finalization attempts.
+Before=ostree-finalize-staged.service
+
+[Service]
+Type=oneshot
+# To write to /boot while keeping it read-only
+MountFlags=slave
+RemainAfterExit=yes
+ExecStart=/usr/bin/ostree admin boot-complete
index c9a6e2e13a1c21ff839210a1c55310516e391797..f6c114f4802124b0e4f9d08b118a20b94ac76f96 100644 (file)
@@ -51,6 +51,7 @@ ostree_cmd__private__ (void)
     _ostree_repo_static_delta_delete,
     _ostree_repo_verify_bindings,
     _ostree_sysroot_finalize_staged,
+    _ostree_sysroot_boot_complete,
   };
 
   return &table;
index 46452ebd60b83efc13ae1d281714853064a6d312..17f943c8b21cf5a7d944403a092f98ca12740298 100644 (file)
@@ -33,6 +33,7 @@ typedef struct {
   gboolean (* ostree_static_delta_delete) (OstreeRepo *repo, const char *delta_id, GCancellable *cancellable, GError **error);
   gboolean (* ostree_repo_verify_bindings) (const char *collection_id, const char *ref_name, GVariant *commit, GError **error);
   gboolean (* ostree_finalize_staged) (OstreeSysroot *sysroot, GCancellable *cancellable, GError **error);
+  gboolean (* ostree_boot_complete) (OstreeSysroot *sysroot, GCancellable *cancellable, GError **error);
 } OstreeCmdPrivateVTable;
 
 /* Note this not really "public", we just export the symbol, but not the header */
index 769f0cbdafd2c40b8bcc14bbbf48110c6b2c0dab..92d71605f97c5fbb1074341fd40da43110b376b1 100644 (file)
@@ -134,6 +134,8 @@ require_internal_units (const char *normal_dir,
     return FALSE;
   if (symlinkat (SYSTEM_DATA_UNIT_PATH "/ostree-finalize-staged.path", normal_dir_dfd, "multi-user.target.wants/ostree-finalize-staged.path") < 0)
     return glnx_throw_errno_prefix (error, "symlinkat");
+  if (symlinkat (SYSTEM_DATA_UNIT_PATH "/ostree-boot-complete.service", normal_dir_dfd, "multi-user.target.wants/ostree-boot-complete.service") < 0)
+    return glnx_throw_errno_prefix (error, "symlinkat");
 
   return TRUE;
 #else
index 5219e2a4f958f8e53f558ab79785a77981a40a8c..404f336fb5009ac46e54f1e43f0c56c83ef89922 100644 (file)
@@ -3375,10 +3375,10 @@ ostree_sysroot_stage_tree_with_options (OstreeSysroot     *self,
 }
 
 /* Invoked at shutdown time by ostree-finalize-staged.service */
-gboolean
-_ostree_sysroot_finalize_staged (OstreeSysroot *self,
-                                 GCancellable  *cancellable,
-                                 GError       **error)
+static gboolean
+_ostree_sysroot_finalize_staged_inner (OstreeSysroot *self,
+                                       GCancellable  *cancellable,
+                                       GError       **error)
 {
   /* It's totally fine if there's no staged deployment; perhaps down the line
    * though we could teach the ostree cmdline to tell systemd to activate the
@@ -3475,9 +3475,63 @@ _ostree_sysroot_finalize_staged (OstreeSysroot *self,
   if (!ostree_sysroot_prepare_cleanup (self, cancellable, error))
     return FALSE;
 
+  // Cleanup will have closed some FDs, re-ensure writability
+  if (!_ostree_sysroot_ensure_writable (self, error))
+    return FALSE;
+
   return TRUE;
 }
 
+/* Invoked at shutdown time by ostree-finalize-staged.service */
+gboolean
+_ostree_sysroot_finalize_staged (OstreeSysroot *self,
+                                 GCancellable  *cancellable,
+                                 GError       **error)
+{
+  g_autoptr(GError) finalization_error = NULL;
+  if (!_ostree_sysroot_ensure_boot_fd (self, error))
+    return FALSE;
+  if (!_ostree_sysroot_finalize_staged_inner (self, cancellable, &finalization_error))
+    {
+      g_autoptr(GError) writing_error = NULL;
+      g_assert_cmpint (self->boot_fd, !=, -1);
+      if (!glnx_file_replace_contents_at (self->boot_fd, _OSTREE_FINALIZE_STAGED_FAILURE_PATH, 
+                                           (guint8*)finalization_error->message, -1,
+                                           0, cancellable, &writing_error))
+        {
+          // We somehow failed to write the failure message...that's not great.  Maybe ENOSPC on /boot.
+          g_printerr ("Failed to write %s: %s\n", _OSTREE_FINALIZE_STAGED_FAILURE_PATH, writing_error->message);
+        }
+      g_propagate_error (error, g_steal_pointer (&finalization_error));
+      return FALSE;
+    }
+  return TRUE;
+}
+
+/* Invoked at bootup time by ostree-boot-complete.service */
+gboolean
+_ostree_sysroot_boot_complete (OstreeSysroot *self,
+                               GCancellable  *cancellable,
+                               GError       **error)
+{
+  if (!_ostree_sysroot_ensure_boot_fd (self, error))
+    return FALSE;
+
+  glnx_autofd int failure_fd = -1;
+  if (!ot_openat_ignore_enoent (self->boot_fd, _OSTREE_FINALIZE_STAGED_FAILURE_PATH, &failure_fd, error))
+    return FALSE;
+  // If we didn't find a failure log, then there's nothing to do right now.
+  // (Actually this unit shouldn't even be invoked, but we may do more in the future)
+  if (failure_fd == -1)
+    return TRUE;
+  g_autofree char *failure_data = glnx_fd_readall_utf8 (failure_fd, NULL, cancellable, error);
+  if (failure_data == NULL)
+    return glnx_prefix_error (error, "Reading from %s", _OSTREE_FINALIZE_STAGED_FAILURE_PATH);
+  // Remove the file; we don't want to continually error out.
+  (void) unlinkat (self->boot_fd, _OSTREE_FINALIZE_STAGED_FAILURE_PATH, 0);
+  return glnx_throw (error, "ostree-finalize-staged.service failed on previous boot: %s", failure_data);
+}
+
 /**
  * ostree_sysroot_deployment_set_kargs:
  * @self: Sysroot
index cb34eeb3ac23ae90ae52af5b3690da14345159d2..a49a406cf346a249a724d85039741e8663b227f6 100644 (file)
@@ -96,6 +96,9 @@ struct OstreeSysroot {
 #define _OSTREE_SYSROOT_BOOT_INITRAMFS_OVERLAYS "ostree/initramfs-overlays"
 #define _OSTREE_SYSROOT_INITRAMFS_OVERLAYS "boot/" _OSTREE_SYSROOT_BOOT_INITRAMFS_OVERLAYS
 
+// Relative to /boot, consumed by ostree-boot-complete.service
+#define _OSTREE_FINALIZE_STAGED_FAILURE_PATH "ostree/finalize-failure.stamp"
+
 gboolean
 _ostree_sysroot_ensure_writable (OstreeSysroot      *self,
                                  GError            **error);
@@ -142,6 +145,10 @@ gboolean
 _ostree_sysroot_finalize_staged (OstreeSysroot *self,
                                  GCancellable  *cancellable,
                                  GError       **error);
+gboolean
+_ostree_sysroot_boot_complete (OstreeSysroot *self,
+                               GCancellable  *cancellable,
+                               GError       **error);
 
 OstreeDeployment *
 _ostree_sysroot_deserialize_deployment_from_variant (GVariant *v,
index 0e0521dae5c50a5971b24bfe9000b4411dbec98d..67afb5d25097ede5d51bee24d7eff80ee56e3d10 100644 (file)
@@ -357,6 +357,8 @@ _ostree_sysroot_ensure_writable (OstreeSysroot      *self,
   ostree_sysroot_unload (self);
   if (!ensure_sysroot_fd (self, error))
     return FALSE;
+  if (!_ostree_sysroot_ensure_boot_fd (self, error))
+    return FALSE;
 
   return TRUE;
 }
diff --git a/src/ostree/ot-admin-builtin-boot-complete.c b/src/ostree/ot-admin-builtin-boot-complete.c
new file mode 100644 (file)
index 0000000..6e1052f
--- /dev/null
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * SPDX-License-Identifier: LGPL-2.0+
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "ot-main.h"
+#include "ot-admin-builtins.h"
+#include "ot-admin-functions.h"
+#include "ostree.h"
+#include "otutil.h"
+
+#include "ostree-cmdprivate.h"
+
+static GOptionEntry options[] = {
+  { NULL }
+};
+
+gboolean
+ot_admin_builtin_boot_complete (int argc, char **argv, OstreeCommandInvocation *invocation, GCancellable *cancellable, GError **error)
+{
+  /* Just a sanity check; we shouldn't be called outside of the service though.
+   */
+  struct stat stbuf;
+  if (fstatat (AT_FDCWD, OSTREE_PATH_BOOTED, &stbuf, 0) < 0)
+    return TRUE;
+  // We must have been invoked via systemd which should have set up a mount namespace.
+  g_assert (getenv ("INVOCATION_ID"));
+
+  g_autoptr(GOptionContext) context = g_option_context_new ("");
+  g_autoptr(OstreeSysroot) sysroot = NULL;
+  if (!ostree_admin_option_context_parse (context, options, &argc, &argv,
+                                          OSTREE_ADMIN_BUILTIN_FLAG_SUPERUSER,
+                                          invocation, &sysroot, cancellable, error))
+    return FALSE;
+
+  if (!ostree_cmd__private__()->ostree_boot_complete (sysroot, cancellable, error))
+    return FALSE;
+
+  return TRUE;
+}
index d32b617e2b0cd7b79a05874b36bff263c3a7574a..8d9451bec6f86825d2290cdf6a340484f88a2ff1 100644 (file)
@@ -39,6 +39,7 @@ BUILTINPROTO(deploy);
 BUILTINPROTO(cleanup);
 BUILTINPROTO(pin);
 BUILTINPROTO(finalize_staged);
+BUILTINPROTO(boot_complete);
 BUILTINPROTO(unlock);
 BUILTINPROTO(status);
 BUILTINPROTO(set_origin);
index e0d2a60c6438c3d1397f00550ab2e1866f6522f6..af09a6141679ae3997a62f434903d847eb1da702 100644 (file)
@@ -43,6 +43,9 @@ static OstreeCommand admin_subcommands[] = {
   { "finalize-staged", OSTREE_BUILTIN_FLAG_NO_REPO | OSTREE_BUILTIN_FLAG_HIDDEN,
     ot_admin_builtin_finalize_staged,
     "Internal command to run at shutdown time" },
+  { "boot-complete", OSTREE_BUILTIN_FLAG_NO_REPO | OSTREE_BUILTIN_FLAG_HIDDEN,
+    ot_admin_builtin_boot_complete,
+    "Internal command to run at boot after an update was applied" },
   { "init-fs", OSTREE_BUILTIN_FLAG_NO_REPO,
      ot_admin_builtin_init_fs,
     "Initialize a root filesystem" },
index df40f115b0e54f8d04b8c0f8ea41c4c1e6adeec5..7e1991bb58ad313dfeac21ed8b4a03c2a997d9e7 100755 (executable)
@@ -146,6 +146,18 @@ EOF
     # Cleanup refs
     ostree refs --delete staged-deploy nonstaged-deploy
     echo "ok cleanup refs"
+
+    # Now finally, try breaking staged updates and verify that ostree-boot-complete fails on the next boot
+    unshare -m /bin/sh -c 'mount -o remount,rw /boot; chattr +i /boot'
+    rpm-ostree kargs --append=foo=bar
+    /tmp/autopkgtest-reboot "3"
+    ;;
+  "3") 
+    (systemctl status ostree-boot-complete.service || true) | tee out.txt
+    assert_file_has_content out.txt 'error: ostree-finalize-staged.service failed on previous boot.*Operation not permitted'
+    systemctl show -p Result ostree-boot-complete.service > out.txt
+    assert_file_has_content out.txt='Result=exit-code'
+    echo "ok boot-complete.service"
     ;;
   *) fatal "Unexpected AUTOPKGTEST_REBOOT_MARK=${AUTOPKGTEST_REBOOT_MARK}" ;;
 esac